Mount GDrive

In [ ]:
# Mount Google Drive
from google.colab import drive # import drive from google colab

ROOT = "/content/drive/"     # default location for the drive
print(ROOT)                 # print content of ROOT (Optional)

drive.mount(ROOT)           # we mount the google drive at /content/drive

#%cd 'drive'
%pwd
/content/drive/
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/
Out[ ]:
'/content'
In [ ]:
!ls

Set Path

In [3]:
# import join used to join ROOT path and MY_GOOGLE_DRIVE_PATH
from os.path import join  
import os
ROOT = "/content/drive/" 
# path to your project on Google Drive
MY_GOOGLE_DRIVE_PATH = 'My Drive/Colab/CapstoneProject/' 
train_folder = 'Data/stage_2_train_images/'
saved_folder = 'Saved_Data/'

PROJECT_PATH = join(ROOT, MY_GOOGLE_DRIVE_PATH)
TRAIN_PATH = join(PROJECT_PATH, train_folder)
SAVE_PATH = join(PROJECT_PATH, saved_folder)

# It's good to print out the value if you are not sure 
print("PROJECT_PATH: ", PROJECT_PATH)  
print("TRAIN_PATH: ", TRAIN_PATH) 
print("SAVE_PATH: ", SAVE_PATH) 
PROJECT_PATH:  /content/drive/My Drive/Colab/CapstoneProject/
TRAIN_PATH:  /content/drive/My Drive/Colab/CapstoneProject/Data/stage_2_train_images/
SAVE_PATH:  /content/drive/My Drive/Colab/CapstoneProject/Saved_Data/

Kaggle Download

In [ ]:
# Kaggle Download
if(True):
  !pip uninstall -y kaggle
  !pip install --upgrade pip
  !pip install kaggle==1.5.6
  !kaggle -v

  #List Kaggle DataSets
  #!kaggle datasets list
  #List Kaggle Competitions
  #!kaggle competitions list
  #List Competitions with string
  !kaggle competitions list -s pneumonia

  #KAGGLE_DIR = '/content/drive/My Drive/Colab/Kaggle/'
  KAGGLE_DIR = '/content/sample_data/Kaggle/'
  %cd {KAGGLE_DIR}
  !ls
  competition_name = 'rsna-pneumonia-detection-challenge'
  !mkdir {competition_name}
  DOWNLOAD_DIR = KAGGLE_DIR+competition_name+'/'
  print(DOWNLOAD_DIR)

  import os
  os.environ['KAGGLE_CONFIG_DIR'] = KAGGLE_DIR

  !kaggle competitions download -c {competition_name}
  print('Download Complete')
Found existing installation: kaggle 1.5.6
Uninstalling kaggle-1.5.6:
  Successfully uninstalled kaggle-1.5.6
Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (20.1.1)
Processing /root/.cache/pip/wheels/01/3e/ff/77407ebac3ef71a79b9166a8382aecf88415a0bcbe3c095a01/kaggle-1.5.6-py3-none-any.whl
Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.6/dist-packages (from kaggle==1.5.6) (1.12.0)
Requirement already satisfied: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kaggle==1.5.6) (2.8.1)
Requirement already satisfied: python-slugify in /usr/local/lib/python3.6/dist-packages (from kaggle==1.5.6) (4.0.0)
Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from kaggle==1.5.6) (1.24.3)
Requirement already satisfied: certifi in /usr/local/lib/python3.6/dist-packages (from kaggle==1.5.6) (2020.4.5.2)
Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from kaggle==1.5.6) (4.41.1)
Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from kaggle==1.5.6) (2.23.0)
Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.6/dist-packages (from python-slugify->kaggle==1.5.6) (1.3)
Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kaggle==1.5.6) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->kaggle==1.5.6) (2.9)
Installing collected packages: kaggle
Successfully installed kaggle-1.5.6
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.6/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.6/dist-packages/kaggle/api/kaggle_api_extended.py", line 149, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /content/sample_data/Kaggle. Or use the environment method.
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.6/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.6/dist-packages/kaggle/api/kaggle_api_extended.py", line 149, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /content/sample_data/Kaggle. Or use the environment method.
/content/sample_data/Kaggle
/content/sample_data/Kaggle/rsna-pneumonia-detection-challenge/
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.6/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.6/dist-packages/kaggle/api/kaggle_api_extended.py", line 149, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /content/sample_data/Kaggle/. Or use the environment method.
Download Complete

Unzip

In [ ]:
zip_file = '/content/drive/My Drive/Colab/CapstoneProject/Data/rsna-pneumonia-detection-challenge.zip'
unzip_folder = '/content/drive/My Drive/Colab/CapstoneProject/Data/'
if(False): # Have set it to false to avoid it from running again
    !unzip {zip_file} -d {unzip_folder}
    print('unzipping complete')

Read Data

In [ ]:
%cd {PROJECT_PATH}
%cd 'Data'
import pandas as pd

class_info = pd.read_csv('stage_2_detailed_class_info.csv')
print(class_info.shape)
print(class_info.head())

train_labels = pd.read_csv('stage_2_train_labels.csv')
print(train_labels.shape)
print(train_labels.head())

# Total 30227 observations
/content/drive/My Drive/Colab/CapstoneProject
/content/drive/My Drive/Colab/CapstoneProject/Data
(30227, 2)
                              patientId                         class
0  0004cfab-14fd-4e49-80ba-63a80b6bddd6  No Lung Opacity / Not Normal
1  00313ee0-9eaa-42f4-b0ab-c148ed3241cd  No Lung Opacity / Not Normal
2  00322d4d-1c29-4943-afc9-b6754be640eb  No Lung Opacity / Not Normal
3  003d8fa0-6bf1-40ed-b54c-ac657f8495c5                        Normal
4  00436515-870c-4b36-a041-de91049b9ab4                  Lung Opacity
(30227, 6)
                              patientId      x      y  width  height  Target
0  0004cfab-14fd-4e49-80ba-63a80b6bddd6    NaN    NaN    NaN     NaN       0
1  00313ee0-9eaa-42f4-b0ab-c148ed3241cd    NaN    NaN    NaN     NaN       0
2  00322d4d-1c29-4943-afc9-b6754be640eb    NaN    NaN    NaN     NaN       0
3  003d8fa0-6bf1-40ed-b54c-ac657f8495c5    NaN    NaN    NaN     NaN       0
4  00436515-870c-4b36-a041-de91049b9ab4  264.0  152.0  213.0   379.0       1
In [ ]:
#Data Cleanup
def check_data(data_file):
  print('\nIs NA:\n',data_file.isna().sum())
  print('\nUnique Patients:\n',len(data_file['patientId'].unique()))
  # print(train_labels.isna().sum())
  # print(data_file[data_file['Target'] == 1].isna().sum())

#lot of labels are missing. use target =1 only
check_data(class_info)
check_data(train_labels)
#total 26k unique patients are present
# 20672 entries do not have x,y in it
Is NA:
 patientId    0
class        0
dtype: int64

Unique Patients:
 26684

Is NA:
 patientId        0
x            20672
y            20672
width        20672
height       20672
Target           0
dtype: int64

Unique Patients:
 26684
In [ ]:
# Merging the data in two csv into one.
class_info_train_labels_merge = train_labels.merge(class_info, left_on='patientId', right_on='patientId', how='inner')
class_info_train_labels_merge.head()
Out[ ]:
patientId x y width height Target class
0 0004cfab-14fd-4e49-80ba-63a80b6bddd6 NaN NaN NaN NaN 0 No Lung Opacity / Not Normal
1 00313ee0-9eaa-42f4-b0ab-c148ed3241cd NaN NaN NaN NaN 0 No Lung Opacity / Not Normal
2 00322d4d-1c29-4943-afc9-b6754be640eb NaN NaN NaN NaN 0 No Lung Opacity / Not Normal
3 003d8fa0-6bf1-40ed-b54c-ac657f8495c5 NaN NaN NaN NaN 0 Normal
4 00436515-870c-4b36-a041-de91049b9ab4 264.0 152.0 213.0 379.0 1 Lung Opacity
In [ ]:
check_data(class_info_train_labels_merge)
#20672 rows do not have X and Y info.
Is NA:
 patientId        0
x            20672
y            20672
width        20672
height       20672
Target           0
class            0
dtype: int64

Unique Patients:
 26684
In [ ]:
print(class_info_train_labels_merge[class_info_train_labels_merge['Target'] == 0].isna().sum())
#For target =1 we have values in all the columns.
patientId        0
x            20672
y            20672
width        20672
height       20672
Target           0
class            0
dtype: int64
In [ ]:
print(class_info_train_labels_merge[class_info_train_labels_merge['class'] == 'Normal'].isna().sum())
# 8851 missing values for normal class 

print(class_info_train_labels_merge[class_info_train_labels_merge['class'] == 'No Lung Opacity / Not Normal'].isna().sum())
# 11821 missing values, total of 20672 is maintained

print(class_info_train_labels_merge[class_info_train_labels_merge['class'] == 'Lung Opacity'].isna().sum())
# no missing values
patientId       0
x            8851
y            8851
width        8851
height       8851
Target          0
class           0
dtype: int64
patientId        0
x            11821
y            11821
width        11821
height       11821
Target           0
class            0
dtype: int64
patientId    0
x            0
y            0
width        0
height       0
Target       0
class        0
dtype: int64
In [ ]:
class_info_train_labels_merge.groupby('class')['Target'].unique()
# All lung opacity has target=1 and no other class has target 1. so all pneumonia cases are with lung opacity and target=1
Out[ ]:
class
Lung Opacity                    [1]
No Lung Opacity / Not Normal    [0]
Normal                          [0]
Name: Target, dtype: object
In [ ]:
#Check distribution
import seaborn as sns
import matplotlib.pyplot as plt
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [ ]:
sns.set_style("dark")
sns.countplot(train_labels['Target'])
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc405c33550>

Read DICOM Images

In [ ]:
#Set Project Path

%cd {PROJECT_PATH}
%cd 'Data/stage_2_train_images'
/content/drive/My Drive/Colab/CapstoneProject
/content/drive/My Drive/Colab/CapstoneProject/Data/stage_2_train_images
In [ ]:
#Load Dependencies
!pip install pydicom
import pydicom

import matplotlib.pyplot as plt
Collecting pydicom
  Downloading https://files.pythonhosted.org/packages/d3/56/342e1f8ce5afe63bf65c23d0b2c1cd5a05600caad1c211c39725d3a4cc56/pydicom-2.0.0-py3-none-any.whl (35.4MB)
     |████████████████████████████████| 35.5MB 89kB/s 
Installing collected packages: pydicom
Successfully installed pydicom-2.0.0
In [ ]:
# Function to load an image
def load_image(imagename):
  image1 = pydicom.dcmread(imagename)
  print(type(image1))
  return image1
In [ ]:
#Check sample images
fig, axes = plt.subplots(nrows=1, ncols=3 ,figsize=(20,15))
axes[0].set_title('No Lung Opacity')
axes[0].imshow(load_image('00322d4d-1c29-4943-afc9-b6754be640eb.dcm').pixel_array, cmap=plt.cm.bone)
axes[1].set_title('Normal')
axes[1].imshow(load_image('003d8fa0-6bf1-40ed-b54c-ac657f8495c5.dcm').pixel_array, cmap=plt.cm.bone)
axes[2].set_title('Lung Opacity')
axes[2].imshow(load_image('00436515-870c-4b36-a041-de91049b9ab4.dcm').pixel_array, cmap=plt.cm.bone)
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
Out[ ]:
<matplotlib.image.AxesImage at 0x7fc4050b3630>
In [ ]:
image1 = pydicom.dcmread('00436515-870c-4b36-a041-de91049b9ab4.dcm').pixel_array
image1.shape

# Images are of dimension 1024x1024 which would too large for our network. 
# I would reduce the dimension during model preparation to improve speed of training
Out[ ]:
(1024, 1024)

Read DICOM data

In [ ]:
#Check data within a sample file
dcm_data = pydicom.read_file('00322d4d-1c29-4943-afc9-b6754be640eb.dcm')

print(dcm_data)
Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 202
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: Secondary Capture Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.2.276.0.7230010.3.1.4.8323329.11252.1517874357.450548
(0002, 0010) Transfer Syntax UID                 UI: JPEG Baseline (Process 1)
(0002, 0012) Implementation Class UID            UI: 1.2.276.0.7230010.3.0.3.6.0
(0002, 0013) Implementation Version Name         SH: 'OFFIS_DCMTK_360'
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0016) SOP Class UID                       UI: Secondary Capture Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.276.0.7230010.3.1.4.8323329.11252.1517874357.450548
(0008, 0020) Study Date                          DA: '19010101'
(0008, 0030) Study Time                          TM: '000000.00'
(0008, 0050) Accession Number                    SH: ''
(0008, 0060) Modality                            CS: 'CR'
(0008, 0064) Conversion Type                     CS: 'WSD'
(0008, 0090) Referring Physician's Name          PN: ''
(0008, 103e) Series Description                  LO: 'view: AP'
(0010, 0010) Patient's Name                      PN: '00322d4d-1c29-4943-afc9-b6754be640eb'
(0010, 0020) Patient ID                          LO: '00322d4d-1c29-4943-afc9-b6754be640eb'
(0010, 0030) Patient's Birth Date                DA: ''
(0010, 0040) Patient's Sex                       CS: 'M'
(0010, 1010) Patient's Age                       AS: '19'
(0018, 0015) Body Part Examined                  CS: 'CHEST'
(0018, 5101) View Position                       CS: 'AP'
(0020, 000d) Study Instance UID                  UI: 1.2.276.0.7230010.3.1.2.8323329.11252.1517874357.450547
(0020, 000e) Series Instance UID                 UI: 1.2.276.0.7230010.3.1.3.8323329.11252.1517874357.450546
(0020, 0010) Study ID                            SH: ''
(0020, 0011) Series Number                       IS: "1"
(0020, 0013) Instance Number                     IS: "1"
(0020, 0020) Patient Orientation                 CS: ''
(0028, 0002) Samples per Pixel                   US: 1
(0028, 0004) Photometric Interpretation          CS: 'MONOCHROME2'
(0028, 0010) Rows                                US: 1024
(0028, 0011) Columns                             US: 1024
(0028, 0030) Pixel Spacing                       DS: [0.168, 0.168]
(0028, 0100) Bits Allocated                      US: 8
(0028, 0101) Bits Stored                         US: 8
(0028, 0102) High Bit                            US: 7
(0028, 0103) Pixel Representation                US: 0
(0028, 2110) Lossy Image Compression             CS: '01'
(0028, 2114) Lossy Image Compression Method      CS: 'ISO_10918_1'
(7fe0, 0010) Pixel Data                          OB: Array of 124270 elements
In [ ]:
# Function to fetch dicom data from files
# We need to create a combined file with all the parameters in one place.
vars = ['Modality', 'PatientAge', 'PatientSex', 'BodyPartExamined', 'ViewPosition', 'ConversionType', 'Rows', 'Columns', 'PixelSpacing']

def get_DICOM_metadata(class_info): 
  if(os.path.isfile(PROJECT_PATH+'Data/class_info_train_labels_merge_metadata.csv')):
    class_info = pd.read_csv(PROJECT_PATH+'Data/class_info_train_labels_merge_metadata.csv')
    class_info.drop([class_info.columns[0]], axis=1, inplace=True)
  else:
    #get file list from folder
    image_train_path = os.listdir(PROJECT_PATH+'Data/stage_2_train_images')
    image_test_path = os.listdir(PROJECT_PATH+'Data/stage_2_test_images')
    print("Number of images in train set:", len(image_train_path),"\nNumber of images in test set:", len(image_test_path))

    vars = ['Modality', 'PatientAge', 'PatientSex', 'BodyPartExamined', 'ViewPosition', 'ConversionType', 'Rows', 'Columns', 'PixelSpacing']

    for var in vars:
          class_info[var] = None
    #iterate on files and append metadata
    for i,filename in enumerate(image_train_path):
      #print('processing:',i,filename)
      dcm_data = pydicom.read_file(PROJECT_PATH+'Data/stage_2_train_images/'+filename)
      idx = (class_info['patientId']==dcm_data.PatientID)
      class_info.loc[idx,'Modality'] = dcm_data.Modality
      class_info.loc[idx,'PatientAge'] = pd.to_numeric(dcm_data.PatientAge)
      class_info.loc[idx,'PatientSex'] = dcm_data.PatientSex
      class_info.loc[idx,'BodyPartExamined'] = dcm_data.BodyPartExamined
      class_info.loc[idx,'ViewPosition'] = dcm_data.ViewPosition
      class_info.loc[idx,'ConversionType'] = dcm_data.ConversionType
      class_info.loc[idx,'Rows'] = dcm_data.Rows
      class_info.loc[idx,'Columns'] = dcm_data.Columns  
      class_info.loc[idx,'PixelSpacing'] = str.format("{:4.3f}",dcm_data.PixelSpacing[0]) 
    #save to a file for future and dont run the function if the file exists
    class_info.to_csv(PROJECT_PATH+'Data/class_info_train_labels_merge_metadata.csv')
  
  class_info['xc'] = class_info['x'] + class_info['width'] / 2
  class_info['yc'] = class_info['y'] + class_info['height'] / 2

  return class_info

class_info_train_labels_merge_metadata = get_DICOM_metadata(class_info_train_labels_merge)

class_info_train_labels_merge_metadata.head()
Out[ ]:
patientId x y width height Target class Modality PatientAge PatientSex BodyPartExamined ViewPosition ConversionType Rows Columns PixelSpacing xc yc
0 0004cfab-14fd-4e49-80ba-63a80b6bddd6 NaN NaN NaN NaN 0 No Lung Opacity / Not Normal CR 51 F CHEST PA WSD 1024 1024 0.143 NaN NaN
1 00313ee0-9eaa-42f4-b0ab-c148ed3241cd NaN NaN NaN NaN 0 No Lung Opacity / Not Normal CR 48 F CHEST PA WSD 1024 1024 0.194 NaN NaN
2 00322d4d-1c29-4943-afc9-b6754be640eb NaN NaN NaN NaN 0 No Lung Opacity / Not Normal CR 19 M CHEST AP WSD 1024 1024 0.168 NaN NaN
3 003d8fa0-6bf1-40ed-b54c-ac657f8495c5 NaN NaN NaN NaN 0 Normal CR 28 M CHEST PA WSD 1024 1024 0.143 NaN NaN
4 00436515-870c-4b36-a041-de91049b9ab4 264.0 152.0 213.0 379.0 1 Lung Opacity CR 32 F CHEST AP WSD 1024 1024 0.139 370.5 341.5
In [ ]:
# Check improper values in dataset
def check_data_set(class_info):
  print(class_info.isna().sum())
  #print(train_labels[train_labels['Target'] == 1].isna().sum())
check_data_set(class_info)
patientId    0
class        0
dtype: int64
In [ ]:
# Draw graphs from the data
def drawgraphs(data_file,columns,hue=False,width =15,showdistribution=True):
    print('Creating graph for X axis:'," and Y axis:",columns)  
    length=len(columns)*6  
    total = float(len(data_file))

    fig, axes = plt.subplots(nrows=len(columns) if len(columns)>1 else 1,ncols=1,figsize=(width, length) )    
    for index,content in enumerate(columns):
      plt.title(content)

      currentaxes = 0
      if(len(columns)>1):
        currentaxes = axes[index]
      else: 
        currentaxes = axes 
      
      if(hue):
        sns.countplot(x=columns[index],data=data_file,ax=currentaxes, hue=hue)
               
      else:      
        sns.countplot(x=columns[index],data=data_file,ax=currentaxes)

      if(showdistribution):
        for p in (currentaxes.patches):
            height = p.get_height()
            if(height>0 and total>0):
              currentaxes.text(p.get_x()+p.get_width()/2., height + 3, '{:1.2f}%'.format(100*height/total), ha="center")     
    
    return True
In [ ]:
drawgraphs(data_file= class_info_train_labels_merge_metadata,columns= class_info_train_labels_merge_metadata.columns[5:16],hue= 'class')
#Inference: 
# - All lung opacity is in Target=1 only
# - Patient age has a distribution needs to be bucketized to get clearer picture
# - Higher %age of males have this prob
# - View position AP has higher lung opacity than PA and % of normal is also lower in AP
# - Coversion type, rows columns, body part examined, modality has only 1 value so its not useful
# - Pixel spacing 0.168 has higher lung opacity.
Creating graph for X axis:  and Y axis: Index(['Target', 'class', 'Modality', 'PatientAge', 'PatientSex',
       'BodyPartExamined', 'ViewPosition', 'ConversionType', 'Rows', 'Columns',
       'PixelSpacing'],
      dtype='object')
Out[ ]:
True
In [ ]:
drawgraphs(data_file= class_info_train_labels_merge_metadata, columns= ['PatientAge'], width =20, showdistribution=True)
Creating graph for X axis:  and Y axis: ['PatientAge']
Out[ ]:
True
In [ ]:
import numpy as np
custom_bucket_array = np.linspace(0, 160, 17)
custom_bucket_array

class_info_train_labels_merge_metadata['PatientAgeBucket'] =pd.cut(class_info_train_labels_merge_metadata['PatientAge'], custom_bucket_array)
class_info_train_labels_merge_metadata.head(1)
Out[ ]:
patientId x y width height Target class Modality PatientAge PatientSex BodyPartExamined ViewPosition ConversionType Rows Columns PixelSpacing xc yc PatientAgeBucket
0 0004cfab-14fd-4e49-80ba-63a80b6bddd6 NaN NaN NaN NaN 0 No Lung Opacity / Not Normal CR 51 F CHEST PA WSD 1024 1024 0.143 NaN NaN (50.0, 60.0]
In [ ]:
drawgraphs(data_file= class_info_train_labels_merge_metadata, columns= ['PatientAgeBucket'], width =20, showdistribution=True, hue='PatientSex')
# there are higher number of patients in 50-60 age group as a result changes of having a positive patient in this age group is higher.
Creating graph for X axis:  and Y axis: ['PatientAgeBucket']
Out[ ]:
True
In [ ]:
drawgraphs(data_file= class_info_train_labels_merge_metadata, columns= ['PatientSex'], width =10, showdistribution=True, hue='Target')
# There are higher number of records of male than female. Both for target 0 and 1.
Creating graph for X axis:  and Y axis: ['PatientSex']
Out[ ]:
True

Bounding box analysis

In [ ]:
class_info_train_labels_merge_metadata['xc'] = class_info_train_labels_merge_metadata['x'] + class_info_train_labels_merge_metadata['width'] / 2
class_info_train_labels_merge_metadata['yc'] = class_info_train_labels_merge_metadata['y'] + class_info_train_labels_merge_metadata['height'] / 2
class_info_train_labels_merge_metadata.head(1)
Out[ ]:
patientId x y width height Target class Modality PatientAge PatientSex BodyPartExamined ViewPosition ConversionType Rows Columns PixelSpacing xc yc PatientAgeBucket
0 0004cfab-14fd-4e49-80ba-63a80b6bddd6 NaN NaN NaN NaN 0 No Lung Opacity / Not Normal CR 51 F CHEST PA WSD 1024 1024 0.143 NaN NaN (50.0, 60.0]
In [ ]:
from matplotlib.patches import Rectangle

def plot_window(data,color_point, color_window,text):
    fig, ax = plt.subplots(1,1,figsize=(7,7))
    plt.title("Centers of Lung Opacity rectangles over rectangles\n{}".format(text))
    data.plot.scatter(x='xc', y='yc', xlim=(0,1024), ylim=(0,1024), ax=ax, alpha=0.8, marker=".", color=color_point)
    for i, crt_sample in data.iterrows():
        ax.add_patch(Rectangle(xy=(crt_sample['x'], crt_sample['y']),
            width=crt_sample['width'],height=crt_sample['height'],alpha=3.5e-3, color=color_window))
    plt.show()
In [ ]:
classify = (class_info_train_labels_merge_metadata['ViewPosition']=='AP') 

plot_window(class_info_train_labels_merge_metadata[ classify ],'green', 'yellow', 'Patient View Position: PA')
# Lung Opacities are present mostly in the central part.
In [ ]:
classify = (class_info_train_labels_merge_metadata['ViewPosition']=='PA') 

plot_window(class_info_train_labels_merge_metadata[ classify ],'blue', 'red', 'Patient View Position: PA')
# Distribution is slightly different for view position  PA
In [ ]:
classify = (class_info_train_labels_merge_metadata['ViewPosition']=='PA') & (class_info_train_labels_merge_metadata['PatientAgeBucket']==pd.Interval(50,60))

plot_window(class_info_train_labels_merge_metadata[ classify ],'blue', 'red', 'Patient View Position: PA')
# Checking the distribution for 50-60 age group.
In [ ]:
classify = (class_info_train_labels_merge_metadata['ViewPosition']=='PA') & (class_info_train_labels_merge_metadata['PatientAgeBucket']==pd.Interval(60,70))

plot_window(class_info_train_labels_merge_metadata[ classify ],'blue', 'red', 'Patient View Position: PA')

All Images

In [ ]:
def show_dicom_images_with_boxes(data):
    img_data = list(data.T.to_dict().values())
    f, ax = plt.subplots(3,3, figsize=(16,18))
    for i,data_row in enumerate(img_data):
        patientImage = data_row['patientId']+'.dcm'
        imagePath = os.path.join(PROJECT_PATH,"Data/stage_2_train_images/",patientImage)
        #print(imagePath)
        data_row_img_data = pydicom.read_file(imagePath)
        modality = data_row_img_data.Modality
        age = data_row_img_data.PatientAge
        sex = data_row_img_data.PatientSex
        data_row_img = load_image(imagePath)
        ax[i//3, i%3].imshow(data_row_img.pixel_array, cmap=plt.cm.bone) 
        ax[i//3, i%3].axis('off')
        ax[i//3, i%3].set_title('ID: {}\nModality: {} Age: {} Sex: {} Target: {}\nClass: {}'.format(
                data_row['patientId'],modality, age, sex, data_row['Target'], data_row['class']))
        rows = class_info_train_labels_merge_metadata[class_info_train_labels_merge_metadata['patientId']==data_row['patientId']]
        box_data = list(rows.T.to_dict().values())
        for j, row in enumerate(box_data):
            ax[i//3, i%3].add_patch(Rectangle(xy=(row['x'], row['y']),
                        width=row['width'],height=row['height'], 
                        color="yellow",alpha = 0.1))   
    plt.show()
In [ ]:
show_dicom_images_with_boxes(class_info_train_labels_merge_metadata[class_info_train_labels_merge_metadata['Target']==0].sample(9))
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
In [ ]:
show_dicom_images_with_boxes(class_info_train_labels_merge_metadata[class_info_train_labels_merge_metadata['Target']==1].sample(9))
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>
<class 'pydicom.dataset.FileDataset'>

CNN Model

Load Dependencies

In [ ]:
import os
import csv
import random
!pip install pydicom
import pydicom
import numpy as np
import pandas as pd
from skimage import io
from skimage import measure
from skimage.transform import resize

import tensorflow as tf
from tensorflow import keras

from matplotlib import pyplot as plt
import matplotlib.patches as patches
Collecting pydicom
  Downloading https://files.pythonhosted.org/packages/d3/56/342e1f8ce5afe63bf65c23d0b2c1cd5a05600caad1c211c39725d3a4cc56/pydicom-2.0.0-py3-none-any.whl (35.4MB)
     |████████████████████████████████| 35.5MB 98kB/s 
Installing collected packages: pydicom
Successfully installed pydicom-2.0.0
In [ ]:
# Copy from google drive to colab directory to speed up I/O
# !cp -r -v '/content/drive/My Drive/Colab/CapstoneProject/Data/stage_2_train_images/' '/content/sample_data/'

# Aborted this since it was taking too much time.

Split into train and validation

In [ ]:
#TRAIN_PATH = '/content/sample_data/stage_2_train_images/'

print(TRAIN_PATH)
filenames = {}

read_directory = False
if(read_directory):
  filenames = os.listdir(TRAIN_PATH)
  pd.DataFrame(filenames).to_csv(SAVE_PATH+'train_path_listdir.csv')
else:
  filenames=pd.read_csv(SAVE_PATH+'train_path_listdir.csv', usecols=[1],header=0).values.tolist()
  filenames = [val for sublist in filenames for val in sublist]

# Use part of the data for training earlier and then run for 100% of the data
percentage_data_used = 100
file_count = int(len(filenames)*percentage_data_used/100)
print("Total files available:",file_count)

random.shuffle(filenames)

# split into train and validation filenames
n_valid_samples = int(file_count * 0.3)

train_filenames = filenames[n_valid_samples:file_count]
valid_filenames = filenames[:n_valid_samples]
print('n train samples', len(train_filenames))
print('n valid samples', len(valid_filenames))
n_train_samples = len(filenames) - n_valid_samples

image_dimension = 128
print('Image Dimension to use:',image_dimension)
print('sample file:',filenames[0])
/content/drive/My Drive/Colab/CapstoneProject/Data/stage_2_train_images/
Total files available: 26684
n train samples 18679
n valid samples 8005
Image Dimension to use: 128
sample file: 4d6e7a87-c86a-499d-a383-911aee689f75.dcm

Check distribution of train and test vs original

In [ ]:
# Check dist of selected files based on the csv that was provided. 
# Did not try stratified sampling since the distribution is not impacted much.
def check_distribution(dataframe_to_check):
  filename_check = pd.DataFrame(columns=['patientId','class'])
  #get filename 
  for filename in dataframe_to_check:
    filename_check = filename_check.append(class_info[class_info['patientId'] == filename.split('.')[0]])
    
  print('Rows',len(filename_check))
  print('unique',len(filename_check['patientId'].unique()))
  print(filename_check['class'].value_counts(normalize = True))

check_distribution(train_filenames)
check_distribution(valid_filenames)
Rows 21242
unique 18679
No Lung Opacity / Not Normal    0.389135
Lung Opacity                    0.322333
Normal                          0.288532
Name: class, dtype: float64
Rows 8985
unique 8005
No Lung Opacity / Not Normal    0.395659
Normal                          0.302949
Lung Opacity                    0.301391
Name: class, dtype: float64
In [ ]:
check_distribution(filenames)
# Overall Distribution is very similar to the distribution in train and validation data set.
Rows 30227
unique 26684
No Lung Opacity / Not Normal    0.391074
Lung Opacity                    0.316108
Normal                          0.292818
Name: class, dtype: float64
In [ ]:
# identifying if there are any files beside dcm in the folder
for name in filenames:
  le = len(name)
  if(name[le-3:le] != 'dcm'):
    print(name)

Create a dictionary of pneumonia locations in one place.

In [ ]:
# empty dictionary
pneumonia_locations = {}
# load table
with open(os.path.join(PROJECT_PATH,'Data/stage_2_train_labels.csv'), mode='r') as infile:
    # open reader
    reader = csv.reader(infile)
    # skip header
    next(reader, None)
    # loop through rows
    for rows in reader:
        # retrieve information
        filename = rows[0]
        location = rows[1:5]
        pneumonia = rows[5]
        # if row contains pneumonia add label to dictionary
        # which contains a list of pneumonia locations per filename
        if pneumonia == '1':
            # convert string to float to int
            location = [int(float(i)) for i in location]
            # save pneumonia location in dictionary
            if filename in pneumonia_locations:
                pneumonia_locations[filename].append(location)
            else:
                pneumonia_locations[filename] = [location]

Generator class

In [ ]:
import keras

# The dataset is too large to fit into memory, so we need to create a generator that loads data on the fly.
# Generator class to handle:
# Image load from folder during train and predict modes, shuffle on epoc end, 
# resize loaded images, augment if needed, add trailing channel dimension
class generator(keras.utils.Sequence):
    
    def __init__(self, folder, filenames, pneumonia_locations=None, batch_size=32, image_size=image_dimension, shuffle=True, augment=False, predict=False):
        self.folder = folder
        self.filenames = filenames
        self.pneumonia_locations = pneumonia_locations
        self.batch_size = batch_size
        self.image_size = image_size
        self.shuffle = shuffle
        self.augment = augment
        self.predict = predict
        self.on_epoch_end()
        
    # Loads the file from folder, resizes and augments the data with horizontal flip    
    def __load__(self, filename):
        # load dicom file as numpy array
        #print('reading file:', filename)
        img = pydicom.dcmread(os.path.join(self.folder, filename), force=True).pixel_array

        # create empty mask
        msk = np.zeros(img.shape)
        # get filename without extension
        filename = filename.split('.')[0]
        # if image contains pneumonia
        if filename in self.pneumonia_locations:
            # loop through pneumonia
            for location in self.pneumonia_locations[filename]:
                # add 1's at the location of the pneumonia
                x, y, w, h = location
                msk[y:y+h, x:x+w] = 1
        # resize both image and mask
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        msk = resize(msk, (self.image_size, self.image_size), mode='reflect') > 0.5
        # if augment then horizontal flip half the time
        if self.augment and random.random() > 0.5:
            img = np.fliplr(img)
            msk = np.fliplr(msk)
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        msk = np.expand_dims(msk, -1)
        return img, msk
    
    # Loads images during prediction cycles
    def __loadpredict__(self, filename):
        # load dicom file as numpy array
        # print('reading file:', filename)
        img = pydicom.dcmread(os.path.join(self.folder, filename), force=True).pixel_array
        
        # resize image
        img = resize(img, (self.image_size, self.image_size), mode='reflect')
        # add trailing channel dimension
        img = np.expand_dims(img, -1)
        return img
        
    # Generator must implement this getter function    
    def __getitem__(self, index):
        # select batch
        filenames = self.filenames[index*self.batch_size:(index+1)*self.batch_size]
        # predict mode: return images and filenames
        if self.predict:
            # load files
            imgs = [self.__loadpredict__(filename) for filename in filenames]
            # create numpy batch
            imgs = np.array(imgs)
            return imgs, filenames
        # train mode: return images and masks
        else:
            # load files
            items = [self.__load__(filename) for filename in filenames]
            # unzip images and masks
            imgs, msks = zip(*items)
            # create numpy batch
            imgs = np.array(imgs)
            msks = np.array(msks)
            return imgs, msks

    # Shuffle data before start of next epoc    
    def on_epoch_end(self):
        if self.shuffle:
            random.shuffle(self.filenames)
        
    def __len__(self):
        if self.predict:
            # return everything
            return int(np.ceil(len(self.filenames) / self.batch_size))
        else:
            # return full batches only
            return int(len(self.filenames) / self.batch_size)
Using TensorFlow backend.

Layers and Model Architecture

In [ ]:
# create 1 downsample layer, each containing 4 layers in it
def create_downsample(channels, inputs):
    x = keras.layers.BatchNormalization(momentum=0.9)(inputs)
    x = keras.layers.LeakyReLU(0)(x)
    x = keras.layers.Conv2D(channels, 1, padding='same', use_bias=False)(x)
    x = keras.layers.MaxPool2D(2)(x)
    return x

# creates 1 residual layer, each containing 6 layers in it.
def create_resblock(channels, inputs):
    x = keras.layers.BatchNormalization(momentum=0.9)(inputs)
    x = keras.layers.LeakyReLU(0)(x)
    x = keras.layers.Conv2D(channels, 3, padding='same', use_bias=False)(x)
    x = keras.layers.BatchNormalization(momentum=0.9)(x)
    x = keras.layers.LeakyReLU(0)(x)
    x = keras.layers.Conv2D(channels, 3, padding='same', use_bias=False)(x)
    return keras.layers.add([x, inputs])

# Model creator
# Depth = number of layers in the model
def create_network(input_size, channels, n_blocks=2, depth=4):
    # input layers - 2 layer
    inputs = keras.Input(shape=(input_size, input_size, 1))
    x = keras.layers.Conv2D(channels, 3, padding='same', use_bias=False)(inputs)
    # residual blocks (4*4 downsample + 4*2*6 resblock = 64 layers)
    for d in range(depth):
        channels = channels * 2
        x = create_downsample(channels, x)
        for b in range(n_blocks):
            x = create_resblock(channels, x)
    # output - 4 layers
    x = keras.layers.BatchNormalization(momentum=0.9)(x)
    x = keras.layers.LeakyReLU(0)(x)
    x = keras.layers.Conv2D(1, 1, activation='sigmoid')(x)
    outputs = keras.layers.UpSampling2D(2**depth)(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

Learning Functions

In [ ]:
# define iou or jaccard loss function
def iou_loss(y_true, y_pred):
    y_true = tf.reshape(y_true, [-1])
    y_pred = tf.reshape(y_pred, [-1])
    intersection = tf.reduce_sum(y_true * y_pred)
    score = (intersection + 1.) / (tf.reduce_sum(y_true) + tf.reduce_sum(y_pred) - intersection + 1.)
    return 1 - score

# combine bce loss and iou loss
def iou_bce_loss(y_true, y_pred):
    return 0.5 * keras.losses.binary_crossentropy(y_true, y_pred) + 0.5 * iou_loss(y_true, y_pred)

# mean iou as a metric
def mean_iou(y_true, y_pred):
    y_pred = tf.round(y_pred)
    intersect = tf.reduce_sum(y_true * y_pred, axis=[1, 2, 3])
    union = tf.reduce_sum(y_true, axis=[1, 2, 3]) + tf.reduce_sum(y_pred, axis=[1, 2, 3])
    smooth = tf.ones(tf.shape(intersect))
    return tf.reduce_mean((intersect + smooth) / (union - intersect + smooth))

# create network and compiler
model = create_network(input_size=image_dimension, channels=32, n_blocks=2, depth=4)
model.compile(optimizer='adam',
              loss=iou_bce_loss,
              metrics=['accuracy', mean_iou])

model.summary()

# cosine learning rate annealing
# changes learning rate based on the number of epocs passed
def cosine_annealing(x):
    lr = 0.001
    epochs = 25
    return lr* (np.cos(np.pi*x/epochs)+1.) /2
learning_rate = tf.keras.callbacks.LearningRateScheduler(cosine_annealing)

# keeps logging the epoc output simultaneously while training
csv_logger = tf.keras.callbacks.CSVLogger(SAVE_PATH + 'logs_cnn_segment.csv', append = True)

# Creating checkpoint of the best model to avoid save errors later on.
# Saves training time once the best model is achieved.
cp = tf.keras.callbacks.ModelCheckpoint(filepath = SAVE_PATH + 'model_checkpoint.h5', verbose = 1, save_best_only = True)

# Keep monitoring val_loss to see if there is any improvement. 
# Mostly the model kept loss in a range so keeping patience as 4 to avoid bloating training time. 
# Any improvement of 0.5% in val_loss would get captured
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.005, patience=4, restore_best_weights=True, verbose=1, mode='auto')
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
input_1 (InputLayer)            (None, 128, 128, 1)  0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 128, 128, 32) 288         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 128, 128, 32) 128         conv2d_1[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_1 (LeakyReLU)       (None, 128, 128, 32) 0           batch_normalization_1[0][0]      
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 128, 128, 64) 2048        leaky_re_lu_1[0][0]              
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D)  (None, 64, 64, 64)   0           conv2d_2[0][0]                   
__________________________________________________________________________________________________
batch_normalization_2 (BatchNor (None, 64, 64, 64)   256         max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
leaky_re_lu_2 (LeakyReLU)       (None, 64, 64, 64)   0           batch_normalization_2[0][0]      
__________________________________________________________________________________________________
conv2d_3 (Conv2D)               (None, 64, 64, 64)   36864       leaky_re_lu_2[0][0]              
__________________________________________________________________________________________________
batch_normalization_3 (BatchNor (None, 64, 64, 64)   256         conv2d_3[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_3 (LeakyReLU)       (None, 64, 64, 64)   0           batch_normalization_3[0][0]      
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 64, 64, 64)   36864       leaky_re_lu_3[0][0]              
__________________________________________________________________________________________________
add_1 (Add)                     (None, 64, 64, 64)   0           conv2d_4[0][0]                   
                                                                 max_pooling2d_1[0][0]            
__________________________________________________________________________________________________
batch_normalization_4 (BatchNor (None, 64, 64, 64)   256         add_1[0][0]                      
__________________________________________________________________________________________________
leaky_re_lu_4 (LeakyReLU)       (None, 64, 64, 64)   0           batch_normalization_4[0][0]      
__________________________________________________________________________________________________
conv2d_5 (Conv2D)               (None, 64, 64, 64)   36864       leaky_re_lu_4[0][0]              
__________________________________________________________________________________________________
batch_normalization_5 (BatchNor (None, 64, 64, 64)   256         conv2d_5[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_5 (LeakyReLU)       (None, 64, 64, 64)   0           batch_normalization_5[0][0]      
__________________________________________________________________________________________________
conv2d_6 (Conv2D)               (None, 64, 64, 64)   36864       leaky_re_lu_5[0][0]              
__________________________________________________________________________________________________
add_2 (Add)                     (None, 64, 64, 64)   0           conv2d_6[0][0]                   
                                                                 add_1[0][0]                      
__________________________________________________________________________________________________
batch_normalization_6 (BatchNor (None, 64, 64, 64)   256         add_2[0][0]                      
__________________________________________________________________________________________________
leaky_re_lu_6 (LeakyReLU)       (None, 64, 64, 64)   0           batch_normalization_6[0][0]      
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 64, 64, 128)  8192        leaky_re_lu_6[0][0]              
__________________________________________________________________________________________________
max_pooling2d_2 (MaxPooling2D)  (None, 32, 32, 128)  0           conv2d_7[0][0]                   
__________________________________________________________________________________________________
batch_normalization_7 (BatchNor (None, 32, 32, 128)  512         max_pooling2d_2[0][0]            
__________________________________________________________________________________________________
leaky_re_lu_7 (LeakyReLU)       (None, 32, 32, 128)  0           batch_normalization_7[0][0]      
__________________________________________________________________________________________________
conv2d_8 (Conv2D)               (None, 32, 32, 128)  147456      leaky_re_lu_7[0][0]              
__________________________________________________________________________________________________
batch_normalization_8 (BatchNor (None, 32, 32, 128)  512         conv2d_8[0][0]                   
__________________________________________________________________________________________________
leaky_re_lu_8 (LeakyReLU)       (None, 32, 32, 128)  0           batch_normalization_8[0][0]      
__________________________________________________________________________________________________
conv2d_9 (Conv2D)               (None, 32, 32, 128)  147456      leaky_re_lu_8[0][0]              
__________________________________________________________________________________________________
add_3 (Add)                     (None, 32, 32, 128)  0           conv2d_9[0][0]                   
                                                                 max_pooling2d_2[0][0]            
__________________________________________________________________________________________________
batch_normalization_9 (BatchNor (None, 32, 32, 128)  512         add_3[0][0]                      
__________________________________________________________________________________________________
leaky_re_lu_9 (LeakyReLU)       (None, 32, 32, 128)  0           batch_normalization_9[0][0]      
__________________________________________________________________________________________________
conv2d_10 (Conv2D)              (None, 32, 32, 128)  147456      leaky_re_lu_9[0][0]              
__________________________________________________________________________________________________
batch_normalization_10 (BatchNo (None, 32, 32, 128)  512         conv2d_10[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_10 (LeakyReLU)      (None, 32, 32, 128)  0           batch_normalization_10[0][0]     
__________________________________________________________________________________________________
conv2d_11 (Conv2D)              (None, 32, 32, 128)  147456      leaky_re_lu_10[0][0]             
__________________________________________________________________________________________________
add_4 (Add)                     (None, 32, 32, 128)  0           conv2d_11[0][0]                  
                                                                 add_3[0][0]                      
__________________________________________________________________________________________________
batch_normalization_11 (BatchNo (None, 32, 32, 128)  512         add_4[0][0]                      
__________________________________________________________________________________________________
leaky_re_lu_11 (LeakyReLU)      (None, 32, 32, 128)  0           batch_normalization_11[0][0]     
__________________________________________________________________________________________________
conv2d_12 (Conv2D)              (None, 32, 32, 256)  32768       leaky_re_lu_11[0][0]             
__________________________________________________________________________________________________
max_pooling2d_3 (MaxPooling2D)  (None, 16, 16, 256)  0           conv2d_12[0][0]                  
__________________________________________________________________________________________________
batch_normalization_12 (BatchNo (None, 16, 16, 256)  1024        max_pooling2d_3[0][0]            
__________________________________________________________________________________________________
leaky_re_lu_12 (LeakyReLU)      (None, 16, 16, 256)  0           batch_normalization_12[0][0]     
__________________________________________________________________________________________________
conv2d_13 (Conv2D)              (None, 16, 16, 256)  589824      leaky_re_lu_12[0][0]             
__________________________________________________________________________________________________
batch_normalization_13 (BatchNo (None, 16, 16, 256)  1024        conv2d_13[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_13 (LeakyReLU)      (None, 16, 16, 256)  0           batch_normalization_13[0][0]     
__________________________________________________________________________________________________
conv2d_14 (Conv2D)              (None, 16, 16, 256)  589824      leaky_re_lu_13[0][0]             
__________________________________________________________________________________________________
add_5 (Add)                     (None, 16, 16, 256)  0           conv2d_14[0][0]                  
                                                                 max_pooling2d_3[0][0]            
__________________________________________________________________________________________________
batch_normalization_14 (BatchNo (None, 16, 16, 256)  1024        add_5[0][0]                      
__________________________________________________________________________________________________
leaky_re_lu_14 (LeakyReLU)      (None, 16, 16, 256)  0           batch_normalization_14[0][0]     
__________________________________________________________________________________________________
conv2d_15 (Conv2D)              (None, 16, 16, 256)  589824      leaky_re_lu_14[0][0]             
__________________________________________________________________________________________________
batch_normalization_15 (BatchNo (None, 16, 16, 256)  1024        conv2d_15[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_15 (LeakyReLU)      (None, 16, 16, 256)  0           batch_normalization_15[0][0]     
__________________________________________________________________________________________________
conv2d_16 (Conv2D)              (None, 16, 16, 256)  589824      leaky_re_lu_15[0][0]             
__________________________________________________________________________________________________
add_6 (Add)                     (None, 16, 16, 256)  0           conv2d_16[0][0]                  
                                                                 add_5[0][0]                      
__________________________________________________________________________________________________
batch_normalization_16 (BatchNo (None, 16, 16, 256)  1024        add_6[0][0]                      
__________________________________________________________________________________________________
leaky_re_lu_16 (LeakyReLU)      (None, 16, 16, 256)  0           batch_normalization_16[0][0]     
__________________________________________________________________________________________________
conv2d_17 (Conv2D)              (None, 16, 16, 512)  131072      leaky_re_lu_16[0][0]             
__________________________________________________________________________________________________
max_pooling2d_4 (MaxPooling2D)  (None, 8, 8, 512)    0           conv2d_17[0][0]                  
__________________________________________________________________________________________________
batch_normalization_17 (BatchNo (None, 8, 8, 512)    2048        max_pooling2d_4[0][0]            
__________________________________________________________________________________________________
leaky_re_lu_17 (LeakyReLU)      (None, 8, 8, 512)    0           batch_normalization_17[0][0]     
__________________________________________________________________________________________________
conv2d_18 (Conv2D)              (None, 8, 8, 512)    2359296     leaky_re_lu_17[0][0]             
__________________________________________________________________________________________________
batch_normalization_18 (BatchNo (None, 8, 8, 512)    2048        conv2d_18[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_18 (LeakyReLU)      (None, 8, 8, 512)    0           batch_normalization_18[0][0]     
__________________________________________________________________________________________________
conv2d_19 (Conv2D)              (None, 8, 8, 512)    2359296     leaky_re_lu_18[0][0]             
__________________________________________________________________________________________________
add_7 (Add)                     (None, 8, 8, 512)    0           conv2d_19[0][0]                  
                                                                 max_pooling2d_4[0][0]            
__________________________________________________________________________________________________
batch_normalization_19 (BatchNo (None, 8, 8, 512)    2048        add_7[0][0]                      
__________________________________________________________________________________________________
leaky_re_lu_19 (LeakyReLU)      (None, 8, 8, 512)    0           batch_normalization_19[0][0]     
__________________________________________________________________________________________________
conv2d_20 (Conv2D)              (None, 8, 8, 512)    2359296     leaky_re_lu_19[0][0]             
__________________________________________________________________________________________________
batch_normalization_20 (BatchNo (None, 8, 8, 512)    2048        conv2d_20[0][0]                  
__________________________________________________________________________________________________
leaky_re_lu_20 (LeakyReLU)      (None, 8, 8, 512)    0           batch_normalization_20[0][0]     
__________________________________________________________________________________________________
conv2d_21 (Conv2D)              (None, 8, 8, 512)    2359296     leaky_re_lu_20[0][0]             
__________________________________________________________________________________________________
add_8 (Add)                     (None, 8, 8, 512)    0           conv2d_21[0][0]                  
                                                                 add_7[0][0]                      
__________________________________________________________________________________________________
batch_normalization_21 (BatchNo (None, 8, 8, 512)    2048        add_8[0][0]                      
__________________________________________________________________________________________________
leaky_re_lu_21 (LeakyReLU)      (None, 8, 8, 512)    0           batch_normalization_21[0][0]     
__________________________________________________________________________________________________
conv2d_22 (Conv2D)              (None, 8, 8, 1)      513         leaky_re_lu_21[0][0]             
__________________________________________________________________________________________________
up_sampling2d_1 (UpSampling2D)  (None, 128, 128, 1)  0           conv2d_22[0][0]                  
==================================================================================================
Total params: 12,727,969
Trainable params: 12,718,305
Non-trainable params: 9,664
__________________________________________________________________________________________________

Model Training

In [ ]:
# create train and validation generators
train_gen = generator(TRAIN_PATH, train_filenames, pneumonia_locations, batch_size=32, image_size=image_dimension, shuffle=True, augment=True, predict=False)
valid_gen = generator(TRAIN_PATH, valid_filenames, pneumonia_locations, batch_size=32, image_size=image_dimension, shuffle=False, predict=False)

history = model.fit_generator(train_gen, validation_data=valid_gen, callbacks=[learning_rate,csv_logger,early_stopping], epochs=12, workers=4, use_multiprocessing=True)
Epoch 1/12
/usr/local/lib/python3.6/dist-packages/keras/utils/data_utils.py:616: UserWarning: The input 332 could not be retrieved. It could be because a worker has died.
  UserWarning)
582/583 [============================>.] - ETA: 4s - loss: 0.4865 - accuracy: 0.9622 - mean_iou: 0.6797
/usr/local/lib/python3.6/dist-packages/keras/utils/data_utils.py:616: UserWarning: The input 11 could not be retrieved. It could be because a worker has died.
  UserWarning)
583/583 [==============================] - 3981s 7s/step - loss: 0.4864 - accuracy: 0.9622 - mean_iou: 0.6798 - val_loss: 0.4036 - val_accuracy: 0.9684 - val_mean_iou: 0.7141
Epoch 2/12
583/583 [==============================] - 3083s 5s/step - loss: 0.4521 - accuracy: 0.9654 - mean_iou: 0.7046 - val_loss: 0.4168 - val_accuracy: 0.9698 - val_mean_iou: 0.6944
Epoch 3/12
583/583 [==============================] - 3020s 5s/step - loss: 0.4394 - accuracy: 0.9665 - mean_iou: 0.7146 - val_loss: 0.4320 - val_accuracy: 0.9690 - val_mean_iou: 0.6996
Epoch 4/12
583/583 [==============================] - 3059s 5s/step - loss: 0.4321 - accuracy: 0.9675 - mean_iou: 0.7232 - val_loss: 0.4456 - val_accuracy: 0.9730 - val_mean_iou: 0.7511
Epoch 5/12
583/583 [==============================] - 3085s 5s/step - loss: 0.4242 - accuracy: 0.9682 - mean_iou: 0.7253 - val_loss: 0.4538 - val_accuracy: 0.9752 - val_mean_iou: 0.7591
Epoch 6/12
583/583 [==============================] - 3084s 5s/step - loss: 0.4207 - accuracy: 0.9684 - mean_iou: 0.7287 - val_loss: 0.4048 - val_accuracy: 0.9720 - val_mean_iou: 0.7476
Restoring model weights from the end of the best epoch.
Epoch 00006: early stopping
model saved
dict_keys(['val_loss', 'val_accuracy', 'val_mean_iou', 'loss', 'accuracy', 'mean_iou', 'lr'])
history saved

Model Accuracy Check

In [ ]:
plt.figure(figsize=(25,6))
plt.subplot(131)
plt.plot(history.epoch, history.history["loss"], label="Train loss")
plt.plot(history.epoch, history.history["val_loss"], label="Valid loss")
plt.legend()
plt.subplot(132)
plt.plot(history.epoch, history.history["accuracy"], label="Train accuracy")
plt.plot(history.epoch, history.history["val_accuracy"], label="Valid accuracy")
plt.legend()
plt.subplot(133)
plt.plot(history.epoch, history.history["mean_iou"], label="Train iou")
plt.plot(history.epoch, history.history["val_mean_iou"], label="Valid iou")
plt.legend()
plt.show()

# Validation loss did not increase by more than 0.4036 even after 4 epocs hence early stopping was hit.
# Validation accuracy also peaked between 96-97%. and started going down later on showing the signs of overfitting after epoc #4

Visual Verification

In [ ]:
# Red is predicted mask, Blue is actual Mask
for imgs, msks in valid_gen:
    # predict batch of images
    preds = model.predict(imgs)
    # create figure
    f, axarr = plt.subplots(4, 8, figsize=(20,15))
    
    # Flatten the array
    axarr = axarr.ravel()
    axidx = 0
    
    # loop through batch
    for img, msk, pred in zip(imgs, msks, preds):
        # plot image
        axarr[axidx].imshow(img[:, :, 0])
        
        # threshold true mask
        comp = msk[:, :, 0] > 0.5
        # apply connected components
        comp = measure.label(comp)
        # apply bounding boxes
        predictionString = ''
        for region in measure.regionprops(comp):
            # retrieve x, y, height and width
            y, x, y2, x2 = region.bbox
            height = y2 - y
            width = x2 - x
            axarr[axidx].add_patch(patches.Rectangle((x,y),width,height,linewidth=2,edgecolor='b',facecolor='none'))
        
        # threshold predicted mask
        comp = pred[:, :, 0] > 0.5
        # apply connected components
        comp = measure.label(comp)
        # apply bounding boxes
        predictionString = ''
        for region in measure.regionprops(comp):
            # retrieve x, y, height and width
            y, x, y2, x2 = region.bbox
            height = y2 - y
            width = x2 - x
            axarr[axidx].add_patch(patches.Rectangle((x,y),width,height,linewidth=2,edgecolor='r',facecolor='none'))
        axidx += 1
    plt.show()
    # only plot one batch
    break

    # There was considerable IOU for the images where prediction was correct. 

conclusion

1.In our project we started with exploring the given dataset and we find how the various attributes (obtained from both the files and images) are spread across the entire dataset 2.in our project work we have implemented 3 base model Res-net mobile net and the Xception nets 3.in the inpy file we have impleted with 1 base model Resnet and the corresponding model accuracy has been capture as show in the above 4.The Resnet model is trained for 6 epochs since after 6 epoch the model model accuracy not yet improved hence we used early stopping and saved the model.

  1. the model performance can be improved by tuning the hyper parameters and training for more number of epochs.